Introduction

Introduction

Oral cancer, a major subtype of head and neck cancers, remains one of the most aggressive malignancies globally. Oral squamous cell carcinoma (OSCC) often arises from premalignant lesions such as leukoplakia and progresses through intermediate pathological stages, including hyperkeratosis with no recurrence (HkNR) and dysplasia, before evolving into invasive cancer. Despite therapeutic advancements, early detection remains a challenge, contributing to a persistently low 5-year survival rate.

In this study, we analyze transcriptomic data from the GEO dataset GSE227919, which includes samples from healthy controls, dysplasia, HkNR, and cancer tissues. By performing differential expression analysis across these sequential stages—HkNR vs Control, Dysplasia vs HkNR, and Cancer vs Dysplasia—we aim to capture key transcriptional changes along the disease trajectory. Enrichment analyses, including g:Profiler and GSEA (Gene Set Enrichment Analysis), are employed to identify dysregulated pathways. This approach enables us to explore the molecular landscape of oral cancer progression and potentially highlight targets for early intervention and therapeutic development.

1. Load Packages and Data

library(tidyverse)
library(limma)
library(edgeR)
library(pheatmap)
library(plotly)
library(VennDiagram)
library(gprofiler2)
library(clusterProfiler)
library(msigdbr)
library(ggvenn)
library(DT)
library(knitr)
library(purrr)


expression_data <- read.table("project_data.txt", header = TRUE, row.names = 1) %>% as.matrix()
metadata <- read.csv("project_metadata.csv")

2. Preprocessing

common_samples <- intersect(colnames(expression_data), metadata$sample)
expr_mat <- expression_data[, common_samples]
meta4 <- metadata %>% filter(sample %in% common_samples) %>% arrange(match(sample, common_samples))
rownames(meta4) <- meta4$sample
meta4$Stage <- factor(meta4$group, levels = c("Control", "HkNR", "Dysplasia", "Cancer"), ordered = TRUE)
log_expr <- log2(expr_mat + 1)

3. PCA and Variance Plots

pca <- prcomp(t(log_expr), scale. = TRUE)
pca_df <- as.data.frame(pca$x)
pca_df$Stage <- meta4$Stage

ggplot(pca_df, aes(x = PC1, y = PC2, color = Stage)) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "PCA of Samples")

4. limma-voom Differential Expression

design4 <- model.matrix(~0 + Stage, data = meta4)
colnames(design4) <- levels(meta4$Stage)
dge <- DGEList(counts = expr_mat)
dge <- calcNormFactors(dge)
v <- voom(dge, design4, plot = FALSE)
fit <- lmFit(v, design4)

cont4 <- makeContrasts(
  HkNR_vs_Control = HkNR - Control,
  Dys_vs_HkNR    = Dysplasia - HkNR,
  Canc_vs_Dys   = Cancer - Dysplasia,
  Canc_vs_Control = Cancer - Control,
  levels = design4
)

fit2 <- contrasts.fit(fit, cont4) %>% eBayes()

de_HkNR_vs_Control <- topTable(fit2, coef = "HkNR_vs_Control", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_Dys_vs_HkNR    <- topTable(fit2, coef = "Dys_vs_HkNR", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_Canc_vs_Dys   <- topTable(fit2, coef = "Canc_vs_Dys", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_Canc_vs_Control   <- topTable(fit2, coef = "Canc_vs_Control", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")

4A. Top DEG Tables

get_top_deg_table <- function(de_df, contrast_label) {
  de_df <- de_df %>%
    mutate(Direction = case_when(
      adj.P.Val < 0.05 & logFC > 1  ~ "Upregulated",
      adj.P.Val < 0.05 & logFC < -1 ~ "Downregulated",
      TRUE                          ~ "Not Significant"
    )) %>%
    filter(Direction %in% c("Upregulated", "Downregulated")) %>%
    arrange(desc(abs(logFC))) %>%
    slice(1:20)

  cat(paste0("### Top 20 DEGs: ", contrast_label, "\n\n"))
  datatable(de_df, options = list(pageLength = 10), rownames = FALSE)
}

get_top_deg_table(de_HkNR_vs_Control, "HkNR vs Control")

Top 20 DEGs: HkNR vs Control

get_top_deg_table(de_Dys_vs_HkNR, "Dysplasia vs HkNR")

Top 20 DEGs: Dysplasia vs HkNR

get_top_deg_table(de_Canc_vs_Dys, "Cancer vs Dysplasia")

Top 20 DEGs: Cancer vs Dysplasia

get_top_deg_table(de_Canc_vs_Control, "Cancer vs Control")

Top 20 DEGs: Cancer vs Control

5. Volcano Plots

plot_volcano <- function(df, title) {
  df <- df %>% mutate(logP = -log10(adj.P.Val + 1e-300),
                      Direction = case_when(
                        adj.P.Val < 0.05 & logFC > 1  ~ "Up",
                        adj.P.Val < 0.05 & logFC < -1 ~ "Down",
                        TRUE ~ "NS"))

  plot_ly(df, x = ~logFC, y = ~logP, color = ~Direction, colors = c("blue", "grey", "red"),
          text = ~paste("Gene:", geneID, "<br>logFC:", round(logFC,2), "<br>FDR:", signif(adj.P.Val, 3)),
          hoverinfo = "text", type = 'scatter', mode = 'markers') %>%
    layout(title = title, xaxis = list(title = "log2FC"), yaxis = list(title = "-log10(FDR)"))
}

plot_volcano(de_HkNR_vs_Control, "Volcano: HkNR vs Control")
plot_volcano(de_Dys_vs_HkNR,    "Volcano: Dysplasia vs HkNR")
plot_volcano(de_Canc_vs_Dys,   "Volcano: Cancer vs Dysplasia")
plot_volcano(de_Canc_vs_Control,   "Volcano: Cancer vs Control")

6. Venn Diagram of Significant Genes

genes1 <- de_HkNR_vs_Control %>% filter(adj.P.Val < 0.05) %>% pull(geneID)
genes2 <- de_Dys_vs_HkNR %>% filter(adj.P.Val < 0.05) %>% pull(geneID)
genes3 <- de_Canc_vs_Dys %>% filter(adj.P.Val < 0.05) %>% pull(geneID)

venn_list <- list(
  "HkNR vs Control" = genes1,
  "Dys vs HkNR"    = genes2,
  "Cancer vs Dys" = genes3
)

ggvenn(venn_list, fill_color = c("red", "green", "blue"), text_size = 4, set_name_size = 5)

7. Heatmap of Top DEGs

top_genes <- unique(c(
  de_HkNR_vs_Control %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID),
  de_Dys_vs_HkNR %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID),
  de_Canc_vs_Dys %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID)
))

heat_mat <- log_expr[top_genes, ]          # expression of the pre‑selected genes
ord_samps <- meta4 %>% arrange(Stage) %>% pull(sample)
ann <- data.frame(Stage = meta4$Stage)
rownames(ann) <- meta4$sample

pheatmap(
  heat_mat[, ord_samps],
  annotation_col = ann[ord_samps, , drop = FALSE],
  cluster_rows   = TRUE,
  cluster_cols   = FALSE,
  show_rownames  = TRUE,
  fontsize_row   = 6,
  main           = "Top DEGs Across All Stages"
)

8. GO Enrichment (g:Profiler)

sig_genes <- genes3
gostres <- gost(query = sig_genes, organism = "hsapiens", correction_method = "fdr")
gostplot(gostres, interactive = TRUE, capped = TRUE)

9. Gene Expression Trajectories

expr_long <- log_expr[top_genes, ] %>% as.data.frame() %>%
  rownames_to_column("Gene") %>%
  pivot_longer(-Gene, names_to = "sample", values_to = "logExpr") %>%
  left_join(meta4, by = "sample")

avg_expr <- expr_long %>% group_by(Gene, Stage) %>% summarise(meanExpr = mean(logExpr), .groups = "drop")

ggplot(avg_expr, aes(x = Stage, y = meanExpr, group = 1)) +
  geom_line(color = "steelblue", size = 1) +
  geom_point(size = 2) +
  facet_wrap(~ Gene, scales = "free_y") +
  theme_minimal(base_size = 13) +
  labs(title = "Gene-wise Expression Trajectories", y = "Mean log2(Expression + 1)", x = "Stage")

10. GSEA (MSigDB C2)

library(BiocParallel)
BPPARAM <- SerialParam()  # Single line fix for parallel issues

hs_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>%
  select(gs_name, gene_symbol)

rank_and_run_gsea <- function(de_df, label) {
  ranked <- deframe(de_df[, c("geneID", "logFC")]) %>% sort(decreasing = TRUE)
  GSEA(ranked, TERM2GENE = hs_c2, verbose = FALSE, BPPARAM = BPPARAM)@result %>%
    transmute(ID, Description, NES, p.adjust, Stage = label)
}

df_gsea <- bind_rows(
  rank_and_run_gsea(de_HkNR_vs_Control, "HkNR vs Control"),
  rank_and_run_gsea(de_Dys_vs_HkNR,    "Dysplasia vs HkNR"),
  rank_and_run_gsea(de_Canc_vs_Dys,   "Cancer vs Dysplasia")
)

top_terms <- df_gsea %>%
  filter(p.adjust < 0.05) %>%
  group_by(ID) %>%
  summarise(mean_NES = mean(abs(NES)), .groups = "drop") %>%
  slice_max(mean_NES, n = 10) %>%
  pull(ID)

ggplot(df_gsea %>% filter(ID %in% top_terms),
       aes(x = Stage, y = NES, group = ID, color = ID)) +
  geom_line(size = 1) + geom_point(size = 2) +
  theme_minimal(base_size = 13) +
  labs(title = "GSEA: NES Trajectories Across Stages", x = "Comparison", y = "NES") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))